Libraries

library(tidyverse)
library(naniar)
library(visdat)
library(lubridate)
library(ggridges)
library(htmltools)

# install.packages("devtools")
#devtools::install_github("dill/emoGG")
library(emoGG)
library(leaflet) # leaflet map

Data

Reading in and exploring the data.

longline <- readxl::read_xlsx("data/longline97-2008.xlsx")

longline97-2008.csv

Data Dictionary:

Variable Description
identifier Unique Vessel Identifier
vessel_code Vessel Code
vessel_code2 Vessel Code (taking into account company?)
vessel_code Name of Vessel
gross_tonnage Gross tonnage of the vessel
vessel_length Length of the Vessel (m)
vessel_width Vessel Width
stanchion Stanchions on the vessel
primary_engine_hp Horsepower of the primary engine
second_engine_hp Horsepower of the second engine
carrying_capacity The carrying capacity of the vessel (units?)
year_constructed The year the vessel was constructed
vessel_criteria The criteria for the vessel
vessel_classification The classification of the vessel

Exploring the vessels

  1. How many vessels are in the data set? What is the vessel code?

Vessel Code

longline %>%
  distinct(vessel_code)
## # A tibble: 10 × 1
##    vessel_code
##          <dbl>
##  1      400062
##  2      400065
##  3      400075
##  4      400082
##  5      400088
##  6      400118
##  7      400061
##  8      400081
##  9      400133
## 10      400512

Vessel Code2

longline %>%
  distinct(vessel_code2)
## # A tibble: 10 × 1
##    vessel_code2
##           <dbl>
##  1       400062
##  2       400065
##  3       400075
##  4       400082
##  5       400088
##  6       400118
##  7       400061
##  8       400081
##  9       400133
## 10           NA
  1. How complete is the vessel data?
longline %>%
  select(vessel_code, 
         vessel_code2, 
         vessel_code, 
         gross_tonnage, 
         vessel_length, 
         vessel_width, 
         primary_engine_hp, 
         second_engine_hp, 
         carrying_capacity) %>%
  visdat::vis_miss()

  1. What are their dimensions (length, width, gross tonnage) and carrying capacity? What is the relationship between these variables?
longline %>%
  group_by(vessel_code) %>%
  distinct(vessel_code, vessel_length, vessel_width, gross_tonnage, carrying_capacity) %>%
  ggplot(aes(x = vessel_width, y = vessel_length, label = vessel_code)) +
  geom_point(aes(size = gross_tonnage)) +
  geom_text(check_overlap = TRUE, 
            size = 3,
            hjust = 0, 
            nudge_x = 0.1) +
  lims(x = c(8, 10.5)) +
  labs(x = "Vessel Width (m)", 
       y = "Vessel Length (m)",
       size = "Gross Tonnage (tonnes)")

longline %>%
  group_by(vessel_code) %>%
  distinct(vessel_code, vessel_length, vessel_width, gross_tonnage, carrying_capacity) %>%
  mutate(height = carrying_capacity,
         width = carrying_capacity) %>%
  ggplot(aes(x = vessel_width, y = vessel_length, label = vessel_code, size = I(carrying_capacity/6000))) +
  geom_emoji(emoji = "26f4", show.legend = TRUE) +
  geom_text(check_overlap = TRUE, 
            size = 3,
            hjust = 0, 
            nudge_x = 0.1) +
  lims(x = c(8, 10.5)) +
  labs(x = "Vessel Width (m)", 
       y = "Vessel Length (m)",
       size = "Carrying Capacity")

Observations:

  1. What is their vessel classification?
longline %>%
  select(vessel_code, 
         gross_tonnage, 
         vessel_length, 
         vessel_width, 
         vessel_classification, 
         year_constructed) %>%
  visdat::vis_miss()

unique(longline$vessel_classification)
## [1] "NA"     NA       "casual"

Observations:

longline %>%
  group_by(vessel_code) %>%
  distinct(vessel_code, vessel_length, vessel_width, gross_tonnage, carrying_capacity, year_constructed) 
## # A tibble: 10 × 6
## # Groups:   vessel_code [10]
##    vessel_code gross_tonnage vessel_length vessel_width carrying_capacity
##          <dbl>         <dbl>         <dbl>        <dbl>             <dbl>
##  1      400062          505.          44.5         8.4               350 
##  2      400065          753           53.4         9.5               200 
##  3      400075          537           46.8         8.5               571 
##  4      400082          292           26.6         8                  72 
##  5      400088          653.          48.5         9.2               364.
##  6      400118          752           52.2         9.5               550 
##  7      400061          465.          46.3         8.35              350 
##  8      400081          292           26.6         8                  72 
##  9      400133           NA           NA          NA                  NA 
## 10      400512           NA           NA          NA                  NA 
## # … with 1 more variable: year_constructed <dbl>
  1. When were the vessels constructed?
longline %>%
  group_by(vessel_code) %>%
  distinct(vessel_code, vessel_length, vessel_width, gross_tonnage, carrying_capacity, year_constructed) %>%
  ggplot(aes(x = year_constructed, y = gross_tonnage, label = vessel_code)) +
  geom_point(aes(size = carrying_capacity)) +
  geom_text(check_overlap = TRUE, 
            size = 3,
            hjust = 0, 
            nudge_x = 1) +
  labs(x = "Year Constructed", 
       y = "Gross Tonnage (tonnes)",
       size = "Carrying Capacity") +
  xlim(1965, 2000)

  1. When were the vessels active?
longline <- longline %>%
  mutate(trip_return_date_clean = lubridate::as_datetime("1900-01-01") + days(round(trip_return_date)),
         trip_leaving_date_clean = lubridate::as_datetime("1900-01-01") + days(round(trip_leaving_date)),
         trip_duration = trip_return_date_clean - trip_leaving_date_clean,
         vessel_code = as.factor(vessel_code)) # Not sure here what to use for the origin for the date.
longline %>%
  distinct(vessel_code, trip_number, .keep_all = TRUE) %>%
  ggplot() +
  geom_point(aes(x = trip_leaving_date_clean, y = vessel_code)) +
  geom_point(aes(x = trip_return_date_clean, y = vessel_code)) +
  labs(y = "Vessel Code", x = "Date")

Observations

  1. How did the duration of trips vary by boat? by year?
longline %>%
  distinct(vessel_code, trip_number, .keep_all = TRUE) %>%
  ggplot(aes(x = trip_duration, y = as.factor(vessel_code), fill = as.factor(vessel_code))) + 
  geom_density_ridges(alpha = 0.75) +
  scale_fill_viridis_d() +
  labs(y = "Vessel Code",
       x = "Trip Duration (Days)", 
       fill = "Vessel Code")

What were the trip statistics (number of trips made and the minimum, mean, and maximum trip duration)?

longline %>%
  group_by(vessel_code) %>%
  select(vessel_code, trip_number, trip_duration, haul_number) %>%
  distinct(vessel_code, trip_number, haul_number, .keep_all = TRUE) %>%
  summarise(max_number_trips = max(trip_number),
            max_trip_duration = max(trip_duration),
            min_trip_duration = min(trip_duration),
            mean_trip_duration = round(mean(trip_duration), 2),
            max_hauls = max(haul_number))
## # A tibble: 10 × 6
##    vessel_code max_number_trips max_trip_duration min_trip_duration
##    <fct>                  <dbl> <drtn>            <drtn>           
##  1 400061                    10  63 days          19 days          
##  2 400062                    11  60 days           8 days          
##  3 400065                     8  73 days          33 days          
##  4 400075                     8  88 days          19 days          
##  5 400081                    28  27 days           5 days          
##  6 400082                    29 161 days           5 days          
##  7 400088                     4  67 days          60 days          
##  8 400118                     8  91 days          24 days          
##  9 400133                    14   9 days           4 days          
## 10 400512                     8   6 days           5 days          
## # … with 2 more variables: mean_trip_duration <drtn>, max_hauls <dbl>

Doublecheck your workflow for this one to see if it makes sense.

  1. How did the haul weight vary for each ship over time?
longline %>%
  filter(total_capture_kg > 0 & total_capture_kg < 20000) %>%
  ggplot(aes(x = total_capture_kg, y = vessel_code, fill = vessel_code)) + 
  geom_density_ridges(alpha = 0.75) +
  scale_fill_viridis_d() +
  labs(y = "Vessel Code",
       x = "Total Catch (1000 kg)", 
       fill = "Vessel Code") +
  facet_wrap(~Year) +
  scale_x_continuous(breaks = c(0, 5000, 10000, 15000, 20000), labels = c("0", "5", "10", "15", "20"))

Fishing effort in hauls over time

longline %>%
  group_by(Year, vessel_code, Month) %>%
  summarise(hauls = n()) %>%
  ggplot(aes(x = Month, 
             y = hauls, 
             group = vessel_code, 
             color = vessel_code)) +
  geom_line() +
  scale_color_viridis_d() + 
  coord_polar() +
  facet_wrap(~Year) +
  labs(color = "Vessel Code")

The same plot above without the coord_polar

longline %>%
  group_by(Year, vessel_code, Month) %>%
  summarise(hauls = n()) %>%
  ggplot(aes(x = Month, 
             y = hauls, 
             group = vessel_code, 
             color = vessel_code)) +
  geom_line() +
  scale_color_viridis_d() + 
  facet_wrap(~Year) +
  labs(color = "Vessel Code")

Create an interactive map of fishing haul locations

longline <- longline %>%
  mutate(startlondd = -startlondd,
         startlatdd = -startlatdd,
         endlondd = -endlondd, 
         endlatdd = -endlatdd,
         group = as.factor(paste(vessel_code, trip_number, haul_number)))
pal <- colorNumeric(
  palette = "Oranges",
  domain = longline$Genypterus_blacodes)


longline$labels <- sprintf("<strong>Vessel Code: %s</strong><br/>%g proportion of catch", 
                  longline$vessel_code, longline$Genypterus_blacodes) %>% lapply(htmltools::HTML)

(start_position_map <- leaflet(data = longline) %>%
  addProviderTiles(providers$Esri.OceanBasemap) %>%
  addCircleMarkers(data = longline[longline$vessel_code == 400062,], 
                   lng = ~startlondd, 
                   lat = ~startlatdd, 
                   clusterOptions = NULL, 
                   color = ~pal(Genypterus_blacodes), 
                   label = longline[longline$vessel_code == 400062,]$labels,
                   radius = 2,
                   group = "Vessel 400062") %>%
    addCircleMarkers(data = longline[longline$vessel_code == 400065,], 
                   lng = ~startlondd, 
                   lat = ~startlatdd, 
                   clusterOptions = NULL, 
                   color = ~pal(Genypterus_blacodes), 
                   label = longline[longline$vessel_code == 400065,]$labels,
                   radius = 2,
                   group = "Vessel 400065") %>%
    addCircleMarkers(data = longline[longline$vessel_code == 400075,], 
                   lng = ~startlondd, 
                   lat = ~startlatdd, 
                   clusterOptions = NULL, 
                   color = ~pal(Genypterus_blacodes), 
                   label = longline[longline$vessel_code == 400075,]$labels,
                   radius = 2,
                   group = "Vessel 400075") %>%
    addCircleMarkers(data = longline[longline$vessel_code == 400082,], 
                   lng = ~startlondd, 
                   lat = ~startlatdd, 
                   clusterOptions = NULL, 
                   color = ~pal(Genypterus_blacodes), 
                   label = longline[longline$vessel_code == 400082,]$labels,
                   radius = 2,
                   group = "Vessel 400082") %>%
    addCircleMarkers(data = longline[longline$vessel_code == 400088,], 
                   lng = ~startlondd, 
                   lat = ~startlatdd, 
                   clusterOptions = NULL, 
                   color = ~pal(Genypterus_blacodes), 
                   label = longline[longline$vessel_code == 400088,]$labels,
                   radius = 2,
                   group = "Vessel 400088") %>%
    addCircleMarkers(data = longline[longline$vessel_code == 400118,], 
                   lng = ~startlondd, 
                   lat = ~startlatdd, 
                   clusterOptions = NULL, 
                   color = ~pal(Genypterus_blacodes), 
                   label = longline[longline$vessel_code == 400118,]$labels,
                   radius = 2,
                   group = "Vessel 400118") %>%
    addCircleMarkers(data = longline[longline$vessel_code == 400061,], 
                   lng = ~startlondd, 
                   lat = ~startlatdd, 
                   clusterOptions = NULL, 
                   color = ~pal(Genypterus_blacodes), 
                   label = longline[longline$vessel_code == 400061,]$labels,
                   radius = 2,
                   group = "Vessel 400061") %>%
    addCircleMarkers(data = longline[longline$vessel_code == 400081,], 
                   lng = ~startlondd, 
                   lat = ~startlatdd, 
                   clusterOptions = NULL, 
                   color = ~pal(Genypterus_blacodes), 
                   label = longline[longline$vessel_code == 400081,]$labels,
                   radius = 2,
                   group = "Vessel 400081") %>%
    addCircleMarkers(data = longline[longline$vessel_code == 400133,], 
                   lng = ~startlondd, 
                   lat = ~startlatdd, 
                   clusterOptions = NULL, 
                   color = ~pal(Genypterus_blacodes), 
                   label = longline[longline$vessel_code == 400133,]$labels,
                   radius = 2,
                   group = "Vessel 400133") %>%
    addCircleMarkers(data = longline[longline$vessel_code == 400512,], 
                   lng = ~startlondd, 
                   lat = ~startlatdd, 
                   clusterOptions = NULL, 
                   color = ~pal(Genypterus_blacodes), 
                   label = longline[longline$vessel_code == 400512,]$labels,
                   radius = 2,
                   group = "Vessel 400512") %>%
    addLegend(position = "bottomright",
              pal = pal,
              values = ~Genypterus_blacodes,
              title = "Proportion of catch that is pink cusk-eel",
              opacity = 1)) %>%
    addLayersControl(
    overlayGroups = c("Vessel 400065", "Vessel 400062", "Vessel 400075", 
                      "Vessel 400082", "Vessel 400088", "Vessel 400118",
                      "Vessel 400061", "Vessel 400081", "Vessel 400133",
                      "Vessel 400512"),
    options = layersControlOptions(collapsed = FALSE)
    )
(end_position_map <- leaflet(data = longline) %>%
  addProviderTiles(providers$Esri.OceanBasemap) %>%
  addCircleMarkers(lng = ~endlondd, 
                   lat = ~endlatdd, 
                   clusterOptions = NULL, 
                   color = ~pal(Genypterus_blacodes),
                   label = longline$labels,
                   radius = 2) %>%
    addLegend(position = "bottomright",
              pal = pal,
              values = ~Genypterus_blacodes,
              title = "Proportion of catch that is pink cusk-eel",
              opacity = 1))

Catch Composition: What are the most common species?

What are the highest proportions of species across the different hauls?

longline %>%
  select(vessel_code, trip_number, leaving_port, landing_port, haul_number, total_catch, starts_with(match = "P_")) %>%
  pivot_longer(cols = starts_with(match = "P_"), names_to = "prop_species", values_to = "proportion") %>%
  filter(proportion > 0) %>%
  group_by(vessel_code, trip_number) %>%
  slice_max(order_by = proportion, n = 10) %>%
  ungroup() %>%
  select(prop_species) %>%
  unique()
## # A tibble: 33 × 1
##    prop_species
##    <chr>       
##  1 P_reineta   
##  2 P_varios    
##  3 P_cojinoba.2
##  4 P_jurel     
##  5 P_tollo3    
##  6 P_merluza1  
##  7 P_dorado_   
##  8 P_brotula   
##  9 P_marrajo   
## 10 P_bacalao.1 
## # … with 23 more rows
  1. How often do species come up in the hauls? Presence/Absence
longline %>%
  select(vessel_code, trip_number, leaving_port, landing_port, haul_number, total_catch, Merluccius_australis, Genypterus_blacodes, starts_with(match = "P_")) %>%
  pivot_longer(cols = c(Merluccius_australis, starts_with(match = "P_"), Genypterus_blacodes), names_to = "prop_species", values_to = "proportion") %>%
  filter(proportion > 0) %>%
  group_by(prop_species) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## # A tibble: 77 × 2
##    prop_species             n
##    <chr>                <int>
##  1 Genypterus_blacodes  11329
##  2 Merluccius_australis 11075
##  3 P_bacalao.1           4787
##  4 P_cojinoba.2          2634
##  5 P_varios              2150
##  6 P_reineta             1533
##  7 P_brotula             1067
##  8 P_cab2                 410
##  9 P_merluza1             359
## 10 P_coji                 265
## # … with 67 more rows

How does the weight of congrio dorado and merluza_comun/meluza_del_sur vary in hauls?

longline %>%
  filter(total_capture_kg > 0 & total_capture_kg < 20000, vessel_name != "ISLA SOFIA") %>%
  select(vessel_code, trip_number, leaving_port, landing_port, haul_number, total_catch, merluza_comun, congrio_dorado, merluza_del_sur, bacalao_de_profundidad, Year) %>%
  pivot_longer(cols = c(merluza_comun, congrio_dorado, merluza_del_sur, bacalao_de_profundidad), names_to = "species", values_to = "weight_kg") %>%
  ggplot(aes(x = weight_kg, y = vessel_code, fill = species)) +
  geom_density_ridges(alpha = 0.75) +
  scale_fill_viridis_d() +
  labs(y = "Vessel Code",
       x = "Weight (kg)", 
       fill = "Species") #+

  #scale_x_continuous(breaks = c(0, 5000, 10000, 15000, 20000), labels = c("0", "5", "10", "15", "20"))

How complete is the fishing information?

visdat::vis_dat(longline[, c("J_time_line_set", "J_time_laying_line", "J_time_start_haul", "J_time_end_haul", "c_laying_time", "c_soak_time", "c_haul_time", "min_depth", "max_depth", "mid_depth", "max_min", "hook_type", "number_of_hooks", "bait", "hook_size")])

What is the range in the soak time?

longline %>%
  filter(total_capture_kg > 0 & total_capture_kg < 20000, vessel_name != "ISLA SOFIA") %>%
  ggplot(aes(x = c_soak_time, y = vessel_code)) +
  geom_density_ridges(alpha = 0.75) +
  scale_fill_viridis_d() +
  labs(y = "Vessel Code",
       x = "Soak Time (hours)") 

What is the range in haul time?

longline %>%
  filter(total_capture_kg > 0 & total_capture_kg < 20000, vessel_name != "ISLA SOFIA") %>%
  ggplot(aes(x = c_haul_time, y = vessel_code)) +
  geom_density_ridges(alpha = 0.75) +
  scale_fill_viridis_d() +
  labs(y = "Vessel Code",
       x = "Haul Time (hours)") 

What is the range in the min depth?

longline %>%
  filter(total_capture_kg > 0 & total_capture_kg < 20000, vessel_name != "ISLA SOFIA") %>%
  ggplot(aes(x = min_depth, y = vessel_code)) +
  geom_density_ridges(alpha = 0.75) +
  scale_fill_viridis_d() +
  labs(y = "Vessel Code",
       x = "Min Depth (m)") 

What is the range in the max depth?

longline %>%
  filter(total_capture_kg > 0 & total_capture_kg < 20000, vessel_name != "ISLA SOFIA") %>%
  ggplot(aes(x = max_depth, y = vessel_code)) +
  geom_density_ridges(alpha = 0.75) +
  scale_fill_viridis_d() +
  labs(y = "Vessel Code",
       x = "Max Depth (m)") +
  facet_wrap(~man_zone)

What is the range in the mid depth?

longline %>%
  filter(total_capture_kg > 0 & total_capture_kg < 20000, vessel_name != "ISLA SOFIA") %>%
  ggplot(aes(x = mid_depth, y = vessel_code)) +
  geom_density_ridges(alpha = 0.75) +
  scale_fill_viridis_d() +
  labs(y = "Vessel Code",
       x = "Mid Depth (m)") +
  facet_wrap(~man_zone)

Looking at the hauls and filtering out only the groups that have the most:

longline %>%
  select(vessel_code, vessel_name, trip_number, leaving_port, landing_port, haul_number, total_catch, Merluccius_australis, Genypterus_blacodes, starts_with(match = "P_"), man_zone) %>%
  pivot_longer(cols = c(Merluccius_australis, starts_with(match = "P_"), Genypterus_blacodes), names_to = "prop_species", values_to = "proportion") %>%
  filter(proportion > 0, prop_species %in% c("Genypterus_blacodes", "Merluccius_australis", "P_bacalao.1", "P_cojinoba.2", "P_reineta", "P_brotula", "P_cab2", "P_merluza1", "P_coji"), vessel_name != "ISLA SOFIA") %>%
  ggplot(aes(x = proportion, y = vessel_code, fill = prop_species)) +
  geom_density_ridges(alpha = 0.75) +
  scale_fill_viridis_d() +
  labs(y = "Vessel Code",
       x = "Proportion of Catch") +
  facet_wrap(~man_zone)